{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1.获取数据\n",
    "# 2.数据基本处理\n",
    "# 2.1 确定特征值,目标值\n",
    "# 2.2 缺失值处理\n",
    "# 2.3 数据集划分\n",
    "# 3.特征工程(字典特征抽取)\n",
    "# 4.机器学习(决策树)\n",
    "# 5.模型评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "from sklearn.feature_extraction import DictVectorizer\n",
    "# from sklearn.tree import DecisionTreeClassifier, export_graphviz\n",
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1.获取数据\n",
    "data = pd.read_csv(\"http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>row.names</th>\n",
       "      <th>pclass</th>\n",
       "      <th>survived</th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>embarked</th>\n",
       "      <th>home.dest</th>\n",
       "      <th>room</th>\n",
       "      <th>ticket</th>\n",
       "      <th>boat</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1st</td>\n",
       "      <td>1</td>\n",
       "      <td>Allen, Miss Elisabeth Walton</td>\n",
       "      <td>29.0000</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>St Louis, MO</td>\n",
       "      <td>B-5</td>\n",
       "      <td>24160 L221</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1st</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Miss Helen Loraine</td>\n",
       "      <td>2.0000</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "      <td>C26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1st</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mr Hudson Joshua Creighton</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "      <td>C26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>(135)</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1st</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)</td>\n",
       "      <td>25.0000</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "      <td>C26</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1st</td>\n",
       "      <td>1</td>\n",
       "      <td>Allison, Master Hudson Trevor</td>\n",
       "      <td>0.9167</td>\n",
       "      <td>Southampton</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "      <td>C22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>11</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   row.names pclass  survived  \\\n",
       "0          1    1st         1   \n",
       "1          2    1st         0   \n",
       "2          3    1st         0   \n",
       "3          4    1st         0   \n",
       "4          5    1st         1   \n",
       "\n",
       "                                              name      age     embarked  \\\n",
       "0                     Allen, Miss Elisabeth Walton  29.0000  Southampton   \n",
       "1                      Allison, Miss Helen Loraine   2.0000  Southampton   \n",
       "2              Allison, Mr Hudson Joshua Creighton  30.0000  Southampton   \n",
       "3  Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)  25.0000  Southampton   \n",
       "4                    Allison, Master Hudson Trevor   0.9167  Southampton   \n",
       "\n",
       "                         home.dest room      ticket   boat     sex  \n",
       "0                     St Louis, MO  B-5  24160 L221      2  female  \n",
       "1  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female  \n",
       "2  Montreal, PQ / Chesterville, ON  C26         NaN  (135)    male  \n",
       "3  Montreal, PQ / Chesterville, ON  C26         NaN    NaN  female  \n",
       "4  Montreal, PQ / Chesterville, ON  C22         NaN     11    male  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.数据基本处理\n",
    "# 2.1 确定特征值,目标值\n",
    "x = data[['pclass', 'age', 'sex']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1\n",
       "1    0\n",
       "2    0\n",
       "3    0\n",
       "4    1\n",
       "Name: survived, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = data['survived']\n",
    "y.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "f:\\virtual_environment\\ai\\lib\\site-packages\\pandas\\core\\series.py:4523: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  downcast=downcast,\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1st</td>\n",
       "      <td>29.0000</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1st</td>\n",
       "      <td>2.0000</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1st</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1st</td>\n",
       "      <td>25.0000</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1st</td>\n",
       "      <td>0.9167</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  pclass      age     sex\n",
       "0    1st  29.0000  female\n",
       "1    1st   2.0000  female\n",
       "2    1st  30.0000    male\n",
       "3    1st  25.0000  female\n",
       "4    1st   0.9167    male"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2.2 缺失值处理\n",
    "x['age'].fillna(value=x['age'].mean(), inplace=True)\n",
    "x.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>649</th>\n",
       "      <td>3rd</td>\n",
       "      <td>45.000000</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1078</th>\n",
       "      <td>3rd</td>\n",
       "      <td>31.194181</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>59</th>\n",
       "      <td>1st</td>\n",
       "      <td>31.194181</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>201</th>\n",
       "      <td>1st</td>\n",
       "      <td>18.000000</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>1st</td>\n",
       "      <td>31.194181</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>356</th>\n",
       "      <td>2nd</td>\n",
       "      <td>26.000000</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>960</th>\n",
       "      <td>3rd</td>\n",
       "      <td>31.194181</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>812</th>\n",
       "      <td>3rd</td>\n",
       "      <td>31.194181</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>132</th>\n",
       "      <td>1st</td>\n",
       "      <td>36.000000</td>\n",
       "      <td>female</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>885</th>\n",
       "      <td>3rd</td>\n",
       "      <td>31.194181</td>\n",
       "      <td>male</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1050 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     pclass        age     sex\n",
       "649     3rd  45.000000  female\n",
       "1078    3rd  31.194181    male\n",
       "59      1st  31.194181  female\n",
       "201     1st  18.000000    male\n",
       "61      1st  31.194181  female\n",
       "...     ...        ...     ...\n",
       "356     2nd  26.000000    male\n",
       "960     3rd  31.194181    male\n",
       "812     3rd  31.194181    male\n",
       "132     1st  36.000000  female\n",
       "885     3rd  31.194181    male\n",
       "\n",
       "[1050 rows x 3 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2.3 数据集划分\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22, test_size=0.2)\n",
    "x_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.特征工程(字典特征抽取)\n",
    "# 3.1 创建字典特征提取转换器\n",
    "transfer = DictVectorizer()\n",
    "\n",
    "# 3.2 将特征值转换成字典，因为使用的是字典特征提取转换器，所以要将数据字典化\n",
    "x_train = x_train.to_dict(orient=\"records\")\n",
    "x_test = x_test.to_dict(orient=\"records\")\n",
    "\n",
    "# 3.3 标准化\n",
    "x_train = transfer.fit_transform(x_train)\n",
    "x_test = transfer.fit_transform(x_test)\n",
    "# x_test, 现在这是一个对象\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, estimator=RandomForestClassifier(),\n",
       "             param_grid={'max_depth': [5, 8, 15, 25, 30],\n",
       "                         'n_estimators': [120, 200, 300, 500, 800, 1200]})"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 4. 机器学习\n",
    "# 4.1 创建随机森林估计其\n",
    "estimator = RandomForestClassifier()\n",
    "\n",
    "# 4.2 网格搜索交叉验证\n",
    "param_grid = {\"n_estimators\": [120,200,300,500,800,1200], \"max_depth\": [5, 8, 15, 25, 30]}\n",
    "estimator = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5)\n",
    "\n",
    "# 4.3 模型驯良\n",
    "estimator.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.7908745247148289"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 5. 模型评估\n",
    "# 5.1 模型评分\n",
    "estimator.score(x_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(max_depth=5, n_estimators=500)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 5.2 最好的模型\n",
    "estimator.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
